In [1]:
%pylab
%matplotlib inline


Using matplotlib backend: TkAgg
Populating the interactive namespace from numpy and matplotlib

In [2]:
cd ..


/afs/inf.ed.ac.uk/user/s11/s1145806/Documents/git/neukrill-net-work

In [3]:
import sys
import numpy as np
import skimage
import cv2
import sklearn
import imp

In [4]:
import holoviews


:0: FutureWarning: IPython widgets are experimental and may change in the future.

In [5]:
import neukrill_net.utils
import neukrill_net.highlevelfeatures
import neukrill_net.stacked

In [6]:
import time

In [54]:
#%pdb


Automatic pdb calling has been turned OFF

In [7]:
settings = neukrill_net.utils.Settings('settings.json')

In [8]:
X,y = settings.flattened_train_paths(settings.classes)

In [9]:
pkl_names = ['pftas.pkl','contourhistogram.pkl','contourmoments.pkl','haralick.pkl']

In [10]:
t0 = time.time()
hlf = []
XF_list = []
for pkl_name in pkl_names:
    tmp = sklearn.externals.joblib.load('cache/'+pkl_name)
    hlf += [tmp[0]]
    XF_list += [tmp[1]]
print("Loading features took {}".format(time.time()-t0))


Loading features took 0.26828289032

In [11]:
XF = np.concatenate(XF_list,2)

In [12]:
XF.shape


Out[12]:
(1, 30336, 366)

In [13]:
XF[0,0,:]


Out[13]:
array([  9.27835052e-02,   8.24742268e-02,   1.28865979e-01,
         2.57731959e-01,   2.21649485e-01,   1.23711340e-01,
         5.67010309e-02,   1.03092784e-02,   2.57731959e-02,
         9.27835052e-02,   8.24742268e-02,   1.28865979e-01,
         2.57731959e-01,   2.21649485e-01,   1.23711340e-01,
         5.67010309e-02,   1.03092784e-02,   2.57731959e-02,
         7.88043478e-02,   8.69565217e-02,   1.35869565e-01,
         1.87500000e-01,   1.19565217e-01,   8.15217391e-02,
         1.22282609e-01,   1.03260870e-01,   8.42391304e-02,
         9.22432432e-01,   2.94594595e-02,   1.89189189e-02,
         1.70270270e-02,   1.00000000e-02,   1.62162162e-03,
         5.40540541e-04,   0.00000000e+00,   0.00000000e+00,
         9.22432432e-01,   2.94594595e-02,   1.89189189e-02,
         1.70270270e-02,   1.00000000e-02,   1.62162162e-03,
         5.40540541e-04,   0.00000000e+00,   0.00000000e+00,
         8.10550199e-01,   8.11117413e-02,   4.42427680e-02,
         3.14804311e-02,   1.95689166e-02,   1.04934770e-02,
         1.70164492e-03,   8.50822462e-04,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         1.03626940e-02,   0.00000000e+00,   1.55440411e-02,
         0.00000000e+00,   5.18134702e-03,   0.00000000e+00,
         5.18134702e-03,   0.00000000e+00,   1.03626940e-02,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         1.03626940e-02,   5.18134702e-03,   5.18134702e-03,
         0.00000000e+00,   5.18134702e-03,   5.18134702e-03,
         5.18134702e-03,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   5.18134702e-03,   0.00000000e+00,
         0.00000000e+00,   1.03626940e-02,   5.18134702e-03,
         0.00000000e+00,   5.18134702e-03,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   5.18134702e-03,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   5.18134702e-03,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   1.03626940e-02,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   5.18134702e-03,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   5.18134702e-03,   0.00000000e+00,
         5.18134702e-03,   0.00000000e+00,   0.00000000e+00,
         5.18134702e-03,   0.00000000e+00,   0.00000000e+00,
         5.18134702e-03,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   5.18134702e-03,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   5.18134702e-03,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   5.18134702e-03,
         0.00000000e+00,   5.18134702e-03,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   5.18134702e-03,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   5.18134702e-03,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   5.18134702e-03,
         0.00000000e+00,   5.18134702e-03,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   5.18134702e-03,
         5.18134702e-03,   0.00000000e+00,   0.00000000e+00,
         5.18134702e-03,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   5.18134702e-03,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         5.18134702e-03,   5.18134702e-03,   0.00000000e+00,
         1.55440411e-02,   5.18134702e-03,   0.00000000e+00,
         0.00000000e+00,   5.18134702e-03,   5.18134702e-03,
         1.03626940e-02,   2.59067360e-02,   1.55440411e-02,
         1.55440411e-02,   1.03626940e-02,   1.03626940e-02,
         0.00000000e+00,   1.03626940e-02,   1.03626940e-02,
         1.03626940e-02,   3.10880821e-02,   2.07253881e-02,
         1.55440411e-02,   2.07253881e-02,   1.55440411e-02,
         2.07253881e-02,   1.03626940e-02,   2.07253881e-02,
         3.10880821e-02,   2.07253881e-02,   1.55440411e-02,
         1.55440411e-02,   2.07253881e-02,   2.07253881e-02,
         2.59067360e-02,   1.55440411e-02,   2.59067360e-02,
         3.62694301e-02,   5.18134721e-02,   5.18134721e-02,
         3.62694301e-02,   3.10880821e-02,   2.59067360e-02,
         2.07253881e-02,   2.07253881e-02,   2.07253881e-02,
         4.14507762e-02,   4.14507762e-02,   9.32642519e-02,
         1.65803105e-01,   3.52331609e-01,   8.96373034e-01,
         3.26424867e-01,   2.55193896e-01,   2.04862918e+00,
         1.44272580e+00,   2.12646011e+00,  -4.62433192e+00,
         3.15379871e+00,   3.07003271e+04,   1.39122141e+05,
         2.54433417e+05,   2.35574674e-01,   8.21080723e+06,
        -1.02133457e+05,   4.17133345e+04,   3.20081449e-01,
         8.22184495e+06,  -4.12477274e-02,   2.76398107e+03,
         1.50253777e+04,   2.12090229e-02,   6.06816516e-03,
         3.34214333e+05,   1.16103130e+07,   3.61000000e+02,
         1.04675000e+04,   1.95815874e+05,   7.90824091e-02,
         5.61860172e-02,   8.67950000e+03,   2.50394000e+05,
         7.29116057e+06,   5.92909729e-01,   1.17428038e+02,
         8.37376309e-01,   3.60378916e+02,   8.37396275e-01,
         5.04056774e+02,   1.32408763e+03,   1.77634694e+00,
         2.17408130e+00,   2.42847737e-03,   1.54530917e+00,
        -2.79362396e-01,   7.10802264e-01,   8.44503234e-03,
         8.02833026e+01,   1.08540997e-01,   6.01075856e+00,
         7.49707356e-03,   9.91907566e-02,   5.62402684e+01,
         7.00702220e-02,   9.22398670e-02,   4.07591384e-05,
         1.03418161e-01,   5.27140192e-02,   4.48146178e-02])

Naive Bayes


In [14]:
import sklearn.naive_bayes

In [15]:
clf = sklearn.naive_bayes.GaussianNB()

In [16]:
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0)), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)

t1 = time.time()
total = t1-t0
print("Time={}".format(total))

print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))


Time=0.395761013031
Accuracy=0.185917721519
Logloss=26.1907140911

Reduce with Feature selection


In [17]:
X_new = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=45).fit_transform(XF.squeeze(0), y)

In [18]:
my_X = X_new
clf = sklearn.naive_bayes.GaussianNB()

t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(my_X), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)

t1 = time.time()
total = t1-t0
print("Time={}".format(total))

print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))


Time=0.0630669593811
Accuracy=0.308742088608
Logloss=10.6894260686

Random Forest

On original


In [19]:
import sklearn.ensemble

In [21]:
clf = sklearn.ensemble.RandomForestClassifier(n_estimators=1000, max_depth=20, min_samples_leaf=5, n_jobs=12, random_state=42)

t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0)), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)

t1 = time.time()
total = t1-t0
print("Time={}".format(total))

print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))


Time=50.592176199
Accuracy=0.525250527426
Logloss=1.91537714433

This is similar to just the Contour Moments and Haralick features

On reduced


In [22]:
my_X = X_new

clf = sklearn.ensemble.RandomForestClassifier(n_estimators=1000, max_depth=20, min_samples_leaf=5, n_jobs=12, random_state=42)

t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(my_X), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)

t1 = time.time()
total = t1-t0
print("Time={}".format(total))

print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))


Time=23.738656044
Accuracy=0.502505274262
Logloss=1.94713154469

Does slightly worse with fewer features.

Maybe it was too few?


In [23]:
my_X = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=100).fit_transform(XF.squeeze(0), y)

clf = sklearn.ensemble.RandomForestClassifier(n_estimators=1000, max_depth=20, min_samples_leaf=5, n_jobs=12, random_state=42)

t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(my_X), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)

t1 = time.time()
total = t1-t0
print("Time={}".format(total))

print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))


Time=35.6933050156
Accuracy=0.529733649789
Logloss=1.86473985616

Hierarchical classifier


In [24]:
import neukrill_net.taxonomy

In [29]:
reload(neukrill_net.stacked)


Out[29]:
<module 'neukrill_net.stacked' from '/afs/inf.ed.ac.uk/user/s11/s1145806/Documents/git/neukrill-net-tools/neukrill_net/stacked.py'>

In [36]:
reload(neukrill_net.taxonomy)


Out[36]:
<module 'neukrill_net.taxonomy' from '/afs/inf.ed.ac.uk/user/s11/s1145806/Documents/git/neukrill-net-tools/neukrill_net/taxonomy.py'>

In [37]:
neukrill_net.taxonomy.taxonomy


Out[37]:
{'no_class': {'artifacts': {}, 'artifacts_edge': {}},
 'plankton': {'chaetognaths': {'chaetognath_non_sagitta': {},
   'chaetognath_other': {},
   'chaetognath_sagitta': {}},
  'chordate_type1': {},
  'crustaceans': {'amphipods': {},
   'copepods': {'calanoid': {'copepod_calanoid': {},
     'copepod_calanoid_eggs': {},
     'copepod_calanoid_eucalanus': {},
     'copepod_calanoid_flatheads': {},
     'copepod_calanoid_frillyAntennae': {},
     'copepod_calanoid_large': {},
     'copepod_calanoid_large_side_antennatucked': {},
     'copepod_calanoid_octomoms': {},
     'copepod_calanoid_small_longantennae': {},
     'copepod_other': {}},
    'cyclopoid_copepods': {'copepod_cyclopoid_copilia': {},
     'oithona': {'copepod_cyclopoid_oithona': {},
      'copepod_cyclopoid_oithona_eggs': {}}}},
   'crustacean_other': {},
   'shrimp_like': {'decapods_all': {'decapods': {},
     'shrimp_caridean': {},
     'shrimp_sergestidae': {},
     'shrimp_zoea': {}},
    'euphausiids_all_ages': {'euphausiids': {}, 'euphausiids_young': {}},
    'shrimp-like_other': {}},
   'stomatopod': {}},
  'detritus': {'detritus_blob': {},
   'detritus_filamentous': {},
   'detritus_other': {},
   'fecal_pellet': {}},
  'diatoms': {'diatom_chain_string': {}, 'diatom_chain_tube': {}},
  'fish': {'fish_larvae_deep_body': {},
   'fish_larvae_leptocephali': {},
   'fish_larvae_medium_body': {},
   'fish_larvae_myctophids': {},
   'fish_larvae_thin_body': {},
   'fish_larvae_very_thin_body': {}},
  'gastropods': {'heteropod': {},
   'pteropods': {'pteropod_butterfly': {},
    'pteropod_theco_dev_seq': {},
    'pteropod_triangle': {}}},
  'gelatinous zooplankton': {'ctenophores': {'ctenophore_cestid': {},
    'ctenophore_lobate': {},
    'cydippid': {'ctenophore_cydippid_no_tentacles': {},
     'ctenophore_cydippid_tentacles': {}}},
   'ephyra': {},
   'hydromedusae': {'other_hydromedusae': {'hydromedusae_bell_and_tentacles': {},
     'hydromedusae_h15': {},
     'hydromedusae_other': {},
     'hydromedusae_partial_dark': {},
     'hydromedusae_shapeA': {},
     'hydromedusae_shapeA_sideview_small': {},
     'hydromedusae_shapeB': {},
     'hydromedusae_sideview_big': {},
     'hydromedusae_typeD': {},
     'hydromedusae_typeD_bell_and_tentacles': {},
     'hydromedusae_typeE': {},
     'hydromedusae_typeF': {}},
    'sub_hydromedusae1': {'hydromedusae_aglaura': {},
     'hydromedusae_haliscera': {},
     'hydromedusae_haliscera_small_sideview': {},
     'hydromedusae_liriope': {}},
    'sub_hydromedusae2': {'hydromedusae_narco_dark': {},
     'hydromedusae_narco_young': {},
     'hydromedusae_narcomedusae': {},
     'hydromedusae_solmaris': {},
     'hydromedusae_solmundella': {}}},
   'jellies_tentacles': {},
   'pelagic_tunicates': {'appendicularians': {'appendicularian_fritillaridae': {},
     'appendicularian_s_shape': {},
     'appendicularian_slight_curve': {},
     'appendicularian_straight': {}},
    'tunicate': {'tunicate_doliolid': {},
     'tunicate_doliolid_nurse': {},
     'tunicate_partial': {},
     'tunicate_salp': {},
     'tunicate_salp_chains': {}}},
   'siphonophores': {'calycophoran_siphonophores': {'rocketship': {'siphonophore_calycophoran_rocketship_adult': {},
      'siphonophore_calycophoran_rocketship_young': {}},
     'siphonophore_calycophoran_abylidae': {},
     'sphaeronectes': {'siphonophore_calycophoran_sphaeronectes': {},
      'siphonophore_calycophoran_sphaeronectes_stem': {},
      'siphonophore_calycophoran_sphaeronectes_young': {}}},
    'physonect': {'siphonophore_physonect': {},
     'siphonophore_physonect_young': {}},
    'siphonophore_other_parts': {},
    'siphonophore_partial': {}}},
  'other_invert_larvae': {'echinoderm': {'echinoderm_seacucumber_auricularia_larva': {},
    'pluteus': {'echinoderm_larva_pluteus_brittlestar': {},
     'echinoderm_larva_pluteus_early': {},
     'echinoderm_larva_pluteus_typeC': {},
     'echinoderm_larva_pluteus_urchin': {},
     'echinopluteus': {}},
    'seastar': {'echinoderm_larva_seastar_bipinnaria': {},
     'echinoderm_larva_seastar_brachiolaria': {}}},
   'invertebrate_larvae_other_A': {},
   'invertebrate_larvae_other_B': {},
   'tornaria_acorn_worm_larvae': {},
   'trochophore_larvae': {}},
  'polychaete': {},
  'protists': {'acantharia': {'acantharia_protist': {},
    'acantharia_protist_big_center': {},
    'acantharia_protist_halo': {}},
   'protist_noctiluca': {},
   'radiolarian': {'radiolarian_chain': {}, 'radiolarian_colony': {}},
   'sub_protists': {'protist_dark_center': {},
    'protist_fuzzy_olive': {},
    'protist_other': {},
    'protist_star': {}}},
  'trichodesmium': {'trichodesmium_bowtie': {},
   'trichodesmium_multiple': {},
   'trichodesmium_puff': {},
   'trichodesmium_tuft': {}},
  'unknown': {'unknown_blobs_and_smudges': {},
   'unknown_sticks': {},
   'unknown_unclassified': {}}}}

In [38]:
settings.classes


Out[38]:
[u'acantharia_protist',
 u'acantharia_protist_big_center',
 u'acantharia_protist_halo',
 u'amphipods',
 u'appendicularian_fritillaridae',
 u'appendicularian_slight_curve',
 u'appendicularian_s_shape',
 u'appendicularian_straight',
 u'artifacts',
 u'artifacts_edge',
 u'chaetognath_non_sagitta',
 u'chaetognath_other',
 u'chaetognath_sagitta',
 u'chordate_type1',
 u'copepod_calanoid',
 u'copepod_calanoid_eggs',
 u'copepod_calanoid_eucalanus',
 u'copepod_calanoid_flatheads',
 u'copepod_calanoid_frillyAntennae',
 u'copepod_calanoid_large',
 u'copepod_calanoid_large_side_antennatucked',
 u'copepod_calanoid_octomoms',
 u'copepod_calanoid_small_longantennae',
 u'copepod_cyclopoid_copilia',
 u'copepod_cyclopoid_oithona',
 u'copepod_cyclopoid_oithona_eggs',
 u'copepod_other',
 u'crustacean_other',
 u'ctenophore_cestid',
 u'ctenophore_cydippid_no_tentacles',
 u'ctenophore_cydippid_tentacles',
 u'ctenophore_lobate',
 u'decapods',
 u'detritus_blob',
 u'detritus_filamentous',
 u'detritus_other',
 u'diatom_chain_string',
 u'diatom_chain_tube',
 u'echinoderm_larva_pluteus_brittlestar',
 u'echinoderm_larva_pluteus_early',
 u'echinoderm_larva_pluteus_typeC',
 u'echinoderm_larva_pluteus_urchin',
 u'echinoderm_larva_seastar_bipinnaria',
 u'echinoderm_larva_seastar_brachiolaria',
 u'echinoderm_seacucumber_auricularia_larva',
 u'echinopluteus',
 u'ephyra',
 u'euphausiids',
 u'euphausiids_young',
 u'fecal_pellet',
 u'fish_larvae_deep_body',
 u'fish_larvae_leptocephali',
 u'fish_larvae_medium_body',
 u'fish_larvae_myctophids',
 u'fish_larvae_thin_body',
 u'fish_larvae_very_thin_body',
 u'heteropod',
 u'hydromedusae_aglaura',
 u'hydromedusae_bell_and_tentacles',
 u'hydromedusae_h15',
 u'hydromedusae_haliscera',
 u'hydromedusae_haliscera_small_sideview',
 u'hydromedusae_liriope',
 u'hydromedusae_narco_dark',
 u'hydromedusae_narcomedusae',
 u'hydromedusae_narco_young',
 u'hydromedusae_other',
 u'hydromedusae_partial_dark',
 u'hydromedusae_shapeA',
 u'hydromedusae_shapeA_sideview_small',
 u'hydromedusae_shapeB',
 u'hydromedusae_sideview_big',
 u'hydromedusae_solmaris',
 u'hydromedusae_solmundella',
 u'hydromedusae_typeD',
 u'hydromedusae_typeD_bell_and_tentacles',
 u'hydromedusae_typeE',
 u'hydromedusae_typeF',
 u'invertebrate_larvae_other_A',
 u'invertebrate_larvae_other_B',
 u'jellies_tentacles',
 u'polychaete',
 u'protist_dark_center',
 u'protist_fuzzy_olive',
 u'protist_noctiluca',
 u'protist_other',
 u'protist_star',
 u'pteropod_butterfly',
 u'pteropod_theco_dev_seq',
 u'pteropod_triangle',
 u'radiolarian_chain',
 u'radiolarian_colony',
 u'shrimp_caridean',
 u'shrimp-like_other',
 u'shrimp_sergestidae',
 u'shrimp_zoea',
 u'siphonophore_calycophoran_abylidae',
 u'siphonophore_calycophoran_rocketship_adult',
 u'siphonophore_calycophoran_rocketship_young',
 u'siphonophore_calycophoran_sphaeronectes',
 u'siphonophore_calycophoran_sphaeronectes_stem',
 u'siphonophore_calycophoran_sphaeronectes_young',
 u'siphonophore_other_parts',
 u'siphonophore_partial',
 u'siphonophore_physonect',
 u'siphonophore_physonect_young',
 u'stomatopod',
 u'tornaria_acorn_worm_larvae',
 u'trichodesmium_bowtie',
 u'trichodesmium_multiple',
 u'trichodesmium_puff',
 u'trichodesmium_tuft',
 u'trochophore_larvae',
 u'tunicate_doliolid',
 u'tunicate_doliolid_nurse',
 u'tunicate_partial',
 u'tunicate_salp',
 u'tunicate_salp_chains',
 u'unknown_blobs_and_smudges',
 u'unknown_sticks',
 u'unknown_unclassified']

In [39]:
marked_taxonomy = neukrill_net.stacked.propagate_labels_to_leaves(neukrill_net.taxonomy.taxonomy, settings.classes)

In [40]:
marked_taxonomy


Out[40]:
{'no_class': {'artifacts': 8, 'artifacts_edge': 9},
 'plankton': {'chaetognaths': {'chaetognath_non_sagitta': 10,
   'chaetognath_other': 11,
   'chaetognath_sagitta': 12},
  'chordate_type1': 13,
  'crustaceans': {'amphipods': 3,
   'copepods': {'calanoid': {'copepod_calanoid': 14,
     'copepod_calanoid_eggs': 15,
     'copepod_calanoid_eucalanus': 16,
     'copepod_calanoid_flatheads': 17,
     'copepod_calanoid_frillyAntennae': 18,
     'copepod_calanoid_large': 19,
     'copepod_calanoid_large_side_antennatucked': 20,
     'copepod_calanoid_octomoms': 21,
     'copepod_calanoid_small_longantennae': 22,
     'copepod_other': 26},
    'cyclopoid_copepods': {'copepod_cyclopoid_copilia': 23,
     'oithona': {'copepod_cyclopoid_oithona': 24,
      'copepod_cyclopoid_oithona_eggs': 25}}},
   'crustacean_other': 27,
   'shrimp_like': {'decapods_all': {'decapods': 32,
     'shrimp_caridean': 92,
     'shrimp_sergestidae': 94,
     'shrimp_zoea': 95},
    'euphausiids_all_ages': {'euphausiids': 47, 'euphausiids_young': 48},
    'shrimp-like_other': 93},
   'stomatopod': 106},
  'detritus': {'detritus_blob': 33,
   'detritus_filamentous': 34,
   'detritus_other': 35,
   'fecal_pellet': 49},
  'diatoms': {'diatom_chain_string': 36, 'diatom_chain_tube': 37},
  'fish': {'fish_larvae_deep_body': 50,
   'fish_larvae_leptocephali': 51,
   'fish_larvae_medium_body': 52,
   'fish_larvae_myctophids': 53,
   'fish_larvae_thin_body': 54,
   'fish_larvae_very_thin_body': 55},
  'gastropods': {'heteropod': 56,
   'pteropods': {'pteropod_butterfly': 87,
    'pteropod_theco_dev_seq': 88,
    'pteropod_triangle': 89}},
  'gelatinous zooplankton': {'ctenophores': {'ctenophore_cestid': 28,
    'ctenophore_lobate': 31,
    'cydippid': {'ctenophore_cydippid_no_tentacles': 29,
     'ctenophore_cydippid_tentacles': 30}},
   'ephyra': 46,
   'hydromedusae': {'other_hydromedusae': {'hydromedusae_bell_and_tentacles': 58,
     'hydromedusae_h15': 59,
     'hydromedusae_other': 66,
     'hydromedusae_partial_dark': 67,
     'hydromedusae_shapeA': 68,
     'hydromedusae_shapeA_sideview_small': 69,
     'hydromedusae_shapeB': 70,
     'hydromedusae_sideview_big': 71,
     'hydromedusae_typeD': 74,
     'hydromedusae_typeD_bell_and_tentacles': 75,
     'hydromedusae_typeE': 76,
     'hydromedusae_typeF': 77},
    'sub_hydromedusae1': {'hydromedusae_aglaura': 57,
     'hydromedusae_haliscera': 60,
     'hydromedusae_haliscera_small_sideview': 61,
     'hydromedusae_liriope': 62},
    'sub_hydromedusae2': {'hydromedusae_narco_dark': 63,
     'hydromedusae_narco_young': 65,
     'hydromedusae_narcomedusae': 64,
     'hydromedusae_solmaris': 72,
     'hydromedusae_solmundella': 73}},
   'jellies_tentacles': 80,
   'pelagic_tunicates': {'appendicularians': {'appendicularian_fritillaridae': 4,
     'appendicularian_s_shape': 6,
     'appendicularian_slight_curve': 5,
     'appendicularian_straight': 7},
    'tunicate': {'tunicate_doliolid': 113,
     'tunicate_doliolid_nurse': 114,
     'tunicate_partial': 115,
     'tunicate_salp': 116,
     'tunicate_salp_chains': 117}},
   'siphonophores': {'calycophoran_siphonophores': {'rocketship': {'siphonophore_calycophoran_rocketship_adult': 97,
      'siphonophore_calycophoran_rocketship_young': 98},
     'siphonophore_calycophoran_abylidae': 96,
     'sphaeronectes': {'siphonophore_calycophoran_sphaeronectes': 99,
      'siphonophore_calycophoran_sphaeronectes_stem': 100,
      'siphonophore_calycophoran_sphaeronectes_young': 101}},
    'physonect': {'siphonophore_physonect': 104,
     'siphonophore_physonect_young': 105},
    'siphonophore_other_parts': 102,
    'siphonophore_partial': 103}},
  'other_invert_larvae': {'echinoderm': {'echinoderm_seacucumber_auricularia_larva': 44,
    'pluteus': {'echinoderm_larva_pluteus_brittlestar': 38,
     'echinoderm_larva_pluteus_early': 39,
     'echinoderm_larva_pluteus_typeC': 40,
     'echinoderm_larva_pluteus_urchin': 41,
     'echinopluteus': 45},
    'seastar': {'echinoderm_larva_seastar_bipinnaria': 42,
     'echinoderm_larva_seastar_brachiolaria': 43}},
   'invertebrate_larvae_other_A': 78,
   'invertebrate_larvae_other_B': 79,
   'tornaria_acorn_worm_larvae': 107,
   'trochophore_larvae': 112},
  'polychaete': 81,
  'protists': {'acantharia': {'acantharia_protist': 0,
    'acantharia_protist_big_center': 1,
    'acantharia_protist_halo': 2},
   'protist_noctiluca': 84,
   'radiolarian': {'radiolarian_chain': 90, 'radiolarian_colony': 91},
   'sub_protists': {'protist_dark_center': 82,
    'protist_fuzzy_olive': 83,
    'protist_other': 85,
    'protist_star': 86}},
  'trichodesmium': {'trichodesmium_bowtie': 108,
   'trichodesmium_multiple': 109,
   'trichodesmium_puff': 110,
   'trichodesmium_tuft': 111},
  'unknown': {'unknown_blobs_and_smudges': 118,
   'unknown_sticks': 119,
   'unknown_unclassified': 120}}}

In [41]:
base_clf = sklearn.ensemble.RandomForestClassifier(n_estimators=1000, max_depth=20, min_samples_leaf=5, n_jobs=12, random_state=42)

hier_clf = neukrill_net.stacked.HierarchyClassifier(marked_taxonomy, base_clf)

t0 = time.time()
hier_clf.fit(X_train, y_train)

t1 = time.time()
total = t1-t0
print("Time={}".format(total))

t0 = time.time()
p = hier_clf.predict_proba(X_test)
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, p)))
print("Time={}".format(time.time()-t0))

print("Accuracy={}".format(sklearn.metrics.accuracy_score(y_test,np.argmax(p,1))))


Time=70.759239912
Logloss=1.8479429741
Time=17.3513498306
Accuracy=0.530129219409

With all the features left in


In [42]:
base_clf = sklearn.ensemble.RandomForestClassifier(n_estimators=1000, max_depth=20, min_samples_leaf=5, n_jobs=12, random_state=42)

hier_clf = neukrill_net.stacked.HierarchyClassifier(marked_taxonomy, base_clf)

X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0)), y, test_size=0.5, random_state=42)

t0 = time.time()
hier_clf.fit(X_train, y_train)

t1 = time.time()
total = t1-t0
print("Time={}".format(total))

t0 = time.time()
p = hier_clf.predict_proba(X_test)
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, p)))
print("Time={}".format(time.time()-t0))

print("Accuracy={}".format(sklearn.metrics.accuracy_score(y_test,np.argmax(p,1))))


Time=87.1275148392
Logloss=1.91641586977
Time=20.142124176
Accuracy=0.526305379747

Try with a pipline to reduce the number of features at each level


In [44]:
import sklearn.pipeline

In [47]:
base_clf = sklearn.ensemble.RandomForestClassifier(n_estimators=1000, max_depth=20, min_samples_leaf=5, n_jobs=12, random_state=42)
best_filter = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=100)

base_pipe = sklearn.pipeline.Pipeline([('filter', best_filter), ('clf', base_clf)])

hier_clf = neukrill_net.stacked.HierarchyClassifier(marked_taxonomy, base_pipe)

my_X = sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0))
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(my_X, y, test_size=0.5, random_state=42)

t0 = time.time()
hier_clf.fit(X_train, y_train)

t1 = time.time()
total = t1-t0
print("Time={}".format(total))

t0 = time.time()
p = hier_clf.predict_proba(X_test)
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, p)))
print("Time={}".format(time.time()-t0))

print("Accuracy={}".format(sklearn.metrics.accuracy_score(y_test,np.argmax(p,1))))


Time=68.5872459412
Logloss=1.8722423589
Time=18.8261601925
Accuracy=0.520767405063
/afs/inf.ed.ac.uk/user/s11/s1145806/Documents/git/neukrill-venv-auto/lib/python2.7/site-packages/sklearn/feature_selection/univariate_selection.py:106: RuntimeWarning: divide by zero encountered in divide
  f = msb / msw
/afs/inf.ed.ac.uk/user/s11/s1145806/Documents/git/neukrill-venv-auto/lib/python2.7/site-packages/sklearn/feature_selection/univariate_selection.py:106: RuntimeWarning: invalid value encountered in divide
  f = msb / msw

In [47]:
base_clf = sklearn.ensemble.RandomForestClassifier(n_estimators=1000, max_depth=20, min_samples_leaf=5, n_jobs=12, random_state=42)
best_filter = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=100)

base_pipe = sklearn.pipeline.Pipeline([('filter', best_filter), ('clf', base_clf)])

hier_clf = neukrill_net.stacked.HierarchyClassifier(marked_taxonomy, base_pipe)

my_X = sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0))
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(my_X, y, test_size=0.5, random_state=42)

t0 = time.time()
hier_clf.fit(X_train, y_train)

t1 = time.time()
total = t1-t0
print("Time={}".format(total))

t0 = time.time()
p = hier_clf.predict_proba(X_test)
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, p)))
print("Time={}".format(time.time()-t0))

print("Accuracy={}".format(sklearn.metrics.accuracy_score(y_test,np.argmax(p,1))))


Time=68.5872459412
Logloss=1.8722423589
Time=18.8261601925
Accuracy=0.520767405063
/afs/inf.ed.ac.uk/user/s11/s1145806/Documents/git/neukrill-venv-auto/lib/python2.7/site-packages/sklearn/feature_selection/univariate_selection.py:106: RuntimeWarning: divide by zero encountered in divide
  f = msb / msw
/afs/inf.ed.ac.uk/user/s11/s1145806/Documents/git/neukrill-venv-auto/lib/python2.7/site-packages/sklearn/feature_selection/univariate_selection.py:106: RuntimeWarning: invalid value encountered in divide
  f = msb / msw

Logistic Regression


In [48]:
clf = sklearn.linear_model.LogisticRegression(random_state=42)

In [49]:
t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0)), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)

t1 = time.time()
total = t1-t0
print("Time={}".format(total))

print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))


Time=748.569411993
Accuracy=0.470266350211
Logloss=2.16218685973

In [50]:
clf = sklearn.linear_model.LogisticRegression(random_state=42)

my_X = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=100).fit_transform(XF.squeeze(0), y)

t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(my_X), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)

t1 = time.time()
total = t1-t0
print("Time={}".format(total))

print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))


Time=145.000400066
Accuracy=0.527030590717
Logloss=1.83400284889

In [55]:
base_clf = sklearn.linear_model.LogisticRegression(random_state=42)

hier_clf = neukrill_net.stacked.HierarchyClassifier(marked_taxonomy, base_clf)

my_X = sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0))
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(my_X, y, test_size=0.5, random_state=42)

t0 = time.time()
hier_clf.fit(X_train, y_train)

t1 = time.time()
total = t1-t0
print("Time={}".format(total))

t0 = time.time()
p = hier_clf.predict_proba(X_test)
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, p)))
print("Time={}".format(time.time()-t0))

print("Accuracy={}".format(sklearn.metrics.accuracy_score(y_test,np.argmax(p,1))))


Time=123.477735996
Logloss=2.53232405225
Time=0.659505844116
Accuracy=0.448641877637

In [52]:
base_clf = sklearn.linear_model.LogisticRegression(random_state=42)
best_filter = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=100)

base_pipe = sklearn.pipeline.Pipeline([('filter', best_filter), ('clf', base_clf)])

hier_clf = neukrill_net.stacked.HierarchyClassifier(marked_taxonomy, base_pipe)

my_X = sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0))
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(my_X, y, test_size=0.5, random_state=42)

t0 = time.time()
hier_clf.fit(X_train, y_train)

t1 = time.time()
total = t1-t0
print("Time={}".format(total))

t0 = time.time()
p = hier_clf.predict_proba(X_test)
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, p)))
print("Time={}".format(time.time()-t0))

print("Accuracy={}".format(sklearn.metrics.accuracy_score(y_test,np.argmax(p,1))))


Time=21.4947209358
Logloss=2.04123557253
Time=1.12955403328
Accuracy=0.476331751055

Linear SVC


In [56]:
clf = sklearn.svm.SVC(kernel='linear', probability=True, random_state=42)

t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0)), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)

t1 = time.time()
total = t1-t0
print("Time={}".format(total))

print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))


Time=337.150140047
Accuracy=0.452202004219
Logloss=2.14135559841

In [57]:
clf = sklearn.svm.SVC(kernel='linear', probability=True, random_state=42)

my_X = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=100).fit_transform(XF.squeeze(0), y)

t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(my_X), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)

t1 = time.time()
total = t1-t0
print("Time={}".format(total))

print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))


Time=86.2040469646
Accuracy=0.553270042194
Logloss=1.7817047324

In [58]:
base_clf = sklearn.svm.SVC(kernel='linear', probability=True, random_state=42)

hier_clf = neukrill_net.stacked.HierarchyClassifier(marked_taxonomy, base_clf)

my_X = sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0))
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(my_X, y, test_size=0.5, random_state=42)

t0 = time.time()
hier_clf.fit(X_train, y_train)

t1 = time.time()
total = t1-t0
print("Time={}".format(total))

t0 = time.time()
p = hier_clf.predict_proba(X_test)
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, p)))
print("Time={}".format(time.time()-t0))

print("Accuracy={}".format(sklearn.metrics.accuracy_score(y_test,np.argmax(p,1))))


Time=1202.48394012
Logloss=2.14906265613
Time=122.273081064
Accuracy=0.434665084388

In [59]:
base_clf = sklearn.svm.SVC(kernel='linear', probability=True, random_state=42)
best_filter = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=100)

base_pipe = sklearn.pipeline.Pipeline([('filter', best_filter), ('clf', base_clf)])

hier_clf = neukrill_net.stacked.HierarchyClassifier(marked_taxonomy, base_pipe)

my_X = sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0))
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(my_X, y, test_size=0.5, random_state=42)

t0 = time.time()
hier_clf.fit(X_train, y_train)

t1 = time.time()
total = t1-t0
print("Time={}".format(total))

t0 = time.time()
p = hier_clf.predict_proba(X_test)
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, p)))
print("Time={}".format(time.time()-t0))

print("Accuracy={}".format(sklearn.metrics.accuracy_score(y_test,np.argmax(p,1))))


Time=177.089457035
Logloss=1.91130093883
Time=34.1300561428
Accuracy=0.488528481013

Non-linear SVC

one-vs-one


In [60]:
clf = sklearn.svm.SVC(probability=True, random_state=42)

t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0)), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)

t1 = time.time()
total = t1-t0
print("Time={}".format(total))

print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))


Time=518.949584007
Accuracy=0.494857594937
Logloss=1.82661110623

In [61]:
clf = sklearn.svm.SVC(kernel='rbf', probability=True, random_state=42)

my_X = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=100).fit_transform(XF.squeeze(0), y)

t0 = time.time()
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    sklearn.preprocessing.StandardScaler().fit_transform(my_X), y, test_size=0.5, random_state=42)
clf.fit(X_train, y_train)

t1 = time.time()
total = t1-t0
print("Time={}".format(total))

print("Accuracy={}".format(clf.score(X_test, y_test)))
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, clf.predict_proba(X_test))))


Time=121.488356829
Accuracy=0.522679324895
Logloss=1.70179310087

In [62]:
base_clf = sklearn.svm.SVC(kernel='rbf', probability=True, random_state=42)

hier_clf = neukrill_net.stacked.HierarchyClassifier(marked_taxonomy, base_clf)

my_X = sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0))
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(my_X, y, test_size=0.5, random_state=42)

t0 = time.time()
hier_clf.fit(X_train, y_train)

t1 = time.time()
total = t1-t0
print("Time={}".format(total))

t0 = time.time()
p = hier_clf.predict_proba(X_test)
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, p)))
print("Time={}".format(time.time()-t0))

print("Accuracy={}".format(sklearn.metrics.accuracy_score(y_test,np.argmax(p,1))))


Time=446.839751005
Logloss=1.85225221785
Time=184.890357971
Accuracy=0.50039556962

In [63]:
base_clf = sklearn.svm.SVC(kernel='rbf', probability=True, random_state=42)
best_filter = sklearn.feature_selection.SelectKBest(sklearn.feature_selection.f_classif, k=100)

base_pipe = sklearn.pipeline.Pipeline([('filter', best_filter), ('clf', base_clf)])

hier_clf = neukrill_net.stacked.HierarchyClassifier(marked_taxonomy, base_pipe)

my_X = sklearn.preprocessing.StandardScaler().fit_transform(XF.squeeze(0))
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(my_X, y, test_size=0.5, random_state=42)

t0 = time.time()
hier_clf.fit(X_train, y_train)

t1 = time.time()
total = t1-t0
print("Time={}".format(total))

t0 = time.time()
p = hier_clf.predict_proba(X_test)
print("Logloss={}".format(sklearn.metrics.log_loss(y_test, p)))
print("Time={}".format(time.time()-t0))

print("Accuracy={}".format(sklearn.metrics.accuracy_score(y_test,np.argmax(p,1))))


Time=128.495127916
Logloss=1.79123650959
Time=55.6328210831
Accuracy=0.507318037975

In [ ]: